package utd.pallet.classification;

import java.io.IOException;
import java.util.regex.Pattern;

import cc.mallet.pipe.AugmentableFeatureVectorLogScale;
import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.CharSequenceRemoveHTML;
import cc.mallet.pipe.CharSequenceReplace;
import cc.mallet.pipe.FeatureSequence2AugmentableFeatureVector;
import cc.mallet.pipe.Input2CharSequence;
import cc.mallet.pipe.MakeAmpersandXMLFriendly;
import cc.mallet.pipe.Target2Label;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.pipe.TokenSequenceRemoveStopwords;
import cc.mallet.pipe.iterator.ArrayIterator;

import utd.pallet.data.MalletDataImport;

/**
 * Demonstration entry point that feeds a small, hard-coded, labelled text
 * data set through a chain of Mallet pipes by way of {@link MalletDataImport}.
 * See the document Data_Import_1 for a description of the individual pipes.
 *
 * @author Pralabh
 */
public class DataImport {

    /**
     * Matches every run of one or more uppercase letters and/or underscores.
     * Note that it matches the uppercase run itself, not the whole word that
     * contains it. Compiled once because the pattern never changes.
     */
    private static final Pattern UPPERCASE_RUN = Pattern.compile("[\\p{Lu}_]+");

    /** Replacement text handed to the replace pipe together with {@link #UPPERCASE_RUN}. */
    private static final String UPPERCASE_REPLACEMENT = "Pallet";

    /**
     * Builds the pipe chain and pushes every labelled group of the sample
     * training data through it.
     *
     * @param args command-line arguments; not used
     * @throws Exception if the pipe chain cannot be created (for example an
     *         {@link java.io.IOException} raised while building it). Failures
     *         while importing an individual group are reported on standard
     *         error and do not stop the remaining groups.
     */
    public static void main(String[] args) throws Exception {

        // Training data provided by the user. Each top-level entry is a pair:
        // element [0] holds the documents, element [1] holds a single label
        // shared by all of those documents.
        String[][][] trainingdata = new String[][][] {
                {
                        { "on the plains of AFRICA the lions  roar",
                                "in swahili ngoma <HTML> to dance",
                                "nelson mandela is  president of  africa",
                                "the saraha dessert saraha expanding" },
                        { "africa" } },
                {
                        {
                                "panda bears eat bamboo",
                                "china's one child policy has resulted in a surplus of boys",
                                "tigers live in the jungle" }, { "asia" } },
                {
                        { "home of kangaroos", "autralian's for beer - Foster",
                                "steve IRVIN is a herpetologist" },
                        { "australia" } } };

        // Helper that owns the pipe list and receives the imported instances.
        MalletDataImport mc = new MalletDataImport();

        // Register the processing pipes in the order they are to be applied.
        // The two boolean arguments are interpreted by
        // MalletDataImport.createPipe, which is not visible from this file;
        // see Data_Import_1 for their meaning. Any exception raised here
        // propagates to the caller because nothing can be imported without
        // the pipe chain.
        mc.createPipe(new Input2CharSequence("UTF-8"),
                new CharSequenceReplace(UPPERCASE_RUN, UPPERCASE_REPLACEMENT),
                new CharSequenceRemoveHTML(),
                new CharSequence2TokenSequence(),
                new TokenSequenceRemoveStopwords(true, true),
                new MakeAmpersandXMLFriendly(), true,
                new TokenSequence2FeatureSequence(), new Target2Label(),
                new FeatureSequence2AugmentableFeatureVector(false),
                new AugmentableFeatureVectorLogScale(), true);

        // Import every labelled group. Iterating over the array itself (rather
        // than a fixed count) keeps the loop correct when groups are added to
        // or removed from the training data.
        for (String[][] labelledGroup : trainingdata) {
            String[] documents = labelledGroup[0];
            String label = labelledGroup[1][0];
            try {
                mc.addThruPipe(new ArrayIterator(documents, label));
            } catch (Exception e) {
                // Best effort: report which group failed and carry on with
                // the remaining groups.
                System.err.println("Failed to import documents labelled \""
                        + label + "\"");
                e.printStackTrace();
            }
        }

    }
}
